Global

library(phyloseq)
library(ggplot2)
library(dplyr)
library(scales)
library(grid)
library(reshape2)
library(gridExtra)
library(vegan)
library(cowplot)
library(gtable)
library(pander)
library(tidyr)
library(psych)
## objects and functions that will be useful throughout this analysis

# Set the ggplot theme
theme_set(theme_bw())

# Color palette for stations
station_colors = c("red", "#ffa500", "#0080ff")

# Function to order date levels correctly, including Aug 11
order_dates_aug11 <- function(df) {
  df$Date <- factor(df$Date, 
    levels = c("6/16","6/30","7/8","7/14","7/21",
      "7/29","8/4","8/11","8/18","8/25","9/2","9/8","9/15",
      "9/23","9/29","10/6","10/15","10/20","10/27"))
  return(df)
}

# Function to order date levels correctly, not including Aug 11
order_dates <- function(df) {
 df$Date <- factor(df$Date,
   levels = c("6/16","6/30","7/8","7/14","7/21",
     "7/29","8/4", "8/18","8/25","9/2","9/8","9/15",
     "9/23","9/29","10/6","10/15","10/20","10/27"))
 return(df)
}

# Function to create a named list
# Args: a vector of strings
#
# Returns: a list with the names supplied in the vector
named_list <- function(...){
    names <- as.list(substitute(list(...)))[-1L]
    result <- list(...)
    names(result) <- names
    result
}

# Source some useful functions for data normalization
source("~/git_repos/MicrobeMiseq/R/miseqR.R")

Load Data

load("../../Data/formatted-data/erie-data.RData")

# Inspect erie phyloseq object
erie
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 7192 taxa and 53 samples ]
## sample_data() Sample Data:       [ 53 samples by 32 sample variables ]
## tax_table()   Taxonomy Table:    [ 7192 taxa by 9 taxonomic ranks ]

Scale counts to an even depth by dividing by total reads and multiplying by the minimum library size.

## Scale reads in OTU table to even depth 
depth = 15000

erie_scale <- 
  erie %>%
    scale_reads(n = depth, round = "round") 

Figure 1: Bloom temporal dynamics

Create figure 1, showing pigment concentrations, toxin concentration, and proportion of cyanobacterial reads over time and stations.

# Import metadata file with nutrients, pigments and toxin
nutrient <- read.csv("../../Data/formatted-data/nutrient_cleaned.csv")

# Format nutrient data
nutrient_sub <- 
  nutrient %>%
    filter(!(Date %in% c("5/27", "6/10", "11/3"))) %>%
    order_dates_aug11() 

# Calculate relative abundance of Cyanobacteria at each date
cyano_abundance <- 
  erie_scale %>%
    tax_glom(taxrank = "Phylum") %>%                        # conglomerate OTUs to phylum level
    transform_sample_counts(function(x) {x/sum(x)} ) %>%    # transform to relative abundance
    subset_taxa(Phylum == "Cyanobacteria") %>%              # Subset to just Cyanobacteria
    psmelt() %>%                                            # melt phyloseq object
    rename(Cyanobacteria = Abundance) %>%
    select(Cyanobacteria, Date, Station)

# Merge cyanobacteria data with nutrient df
bloom_df <- 
  nutrient_sub %>%
    left_join(cyano_abundance, by = c("Station", "Date")) %>%
    mutate(Phycocyanin = ifelse(Phycocyanin > 80, 80, Phycocyanin)) %>%   # lower extreme values to plot better
    select(Station, Date, Phycocyanin, Chla, ParMC, Cyanobacteria) %>%
    melt(id.vars = c("Station", "Date")) %>%
    order_dates_aug11()
# Make a faceted ggplot of the four bloom variables over time and grouped by station
bloom_plots <- ggplot(bloom_df, 
  aes(x = Date, y = value, group = Station, color = Station, shape = Station)
) +
  facet_grid(variable~., scales = "free_y") +
  geom_point(size = 1.3) +
  geom_line(size = 1) + 
  ylab("") +
  scale_x_discrete(
    breaks = c("7/8", "8/4", "9/2", "10/6"),
    labels = c("Jul", "Aug", "Sep", "Oct"),
    drop = FALSE
  ) +
  scale_color_manual(values = station_colors) + 
  theme(
    strip.background = element_blank(),
    strip.text = element_text(size = 11),
    axis.title.x = element_blank()
  )

# function to extract a legend from a ggplot object
grab_legend <- function(a_ggplot) {
    tmp <- ggplot_gtable(ggplot_build(a_ggplot))
    leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
    legend <- tmp$grobs[[leg]]
    legend
}

station_legend <- grab_legend(bloom_plots) 

bloom_plots

ggsave("../../Plots/Figure1.pdf", plot = bloom_plots, width = 6, height = 5) 

Figure 1 Statistics

par(mfrow = c(1,3))
hist(nutrient$Chla)
hist(nutrient$Phycocyanin)
hist(nutrient$ParMC)

These distributions are very non-normal. Let’s try log-scaling

par(mfrow = c(1,3))
hist(log(nutrient$Chla), xlab = "log Chla", main = "")
hist(log(nutrient$Phycocyanin), xlab = "log Phycocyanin", main = "")
hist(log(nutrient$ParMC), xlab = "log ParMC", main = "")

These look better, so we will use these transformed variables for our correlation tests. Since phycocyanin and parmc both have zeroes, we will add a constant to all values. We selected this constant to be small and to minimally impact the distribution shape

par(mfrow = c(1,2))

logChla <- log(nutrient$Chla)
logPhyco <- log(nutrient$Phycocyanin + 0.1) 
logParMC <- log(nutrient$ParMC + 0.1) 

hist(logPhyco, main = "", xlab = "log Phyco + 0.1")
hist(logParMC, main = "", xlab = "log ParMC + 0.1")

# lets add log-scaled versions of variables to phyloseq object

nutrient$logChla <- logChla
nutrient$logPhyco <- logPhyco
nutrient$logParMC <- logParMC

molec_samples <- as.vector(sample_data(erie)$SampleID)

nutrient_sub <- nutrient %>%
  select(logChla, logPhyco, logParMC, SampleID) %>%
  filter(SampleID %in% molec_samples)

sample_data(erie)$logChla <- nutrient_sub$logChla
sample_data(erie)$logPhyco <- nutrient_sub$logPhyco
sample_data(erie)$logParMC <- nutrient_sub$logParMC
par(mfrow = c(1,2))

plot(logChla, logPhyco, xlab = "log Chla", ylab = "log Phycocyanin") 

# Examine residuals from linear regrerssion
hist(lm(logChla~logPhyco)$residuals, xlab = "residuals", main = "")

# Calculate linear correlation between Chla and phycocyanin for all sites
cor.test(
  x = logChla, 
  y = logPhyco, 
  alternative = "two.sided", 
  method = "pearson"
)
## 
##  Pearson's product-moment correlation
## 
## data:  logChla and logPhyco
## t = 10.175, df = 55, p-value = 2.985e-14
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6936187 0.8828031
## sample estimates:
##       cor 
## 0.8081294

It looks like there is a pretty close correlation between chl a and phycocyanin measurements. Assumptions about normality of model residuals are met when log-scaling both variables.

par(mfrow = c(1,2))
plot(logChla, logParMC)
plot(logPhyco, logParMC)

cor.test(
  x = logChla, 
  y = logParMC, 
  alternative = "two.sided", 
  method = "pearson"
)
## 
##  Pearson's product-moment correlation
## 
## data:  logChla and logParMC
## t = 6.2612, df = 55, p-value = 6.071e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4622297 0.7753393
## sample estimates:
##       cor 
## 0.6451002
cor.test(
  x = logPhyco, 
  y = logParMC, 
  alternative = "two.sided", 
  method = "pearson"
)
## 
##  Pearson's product-moment correlation
## 
## data:  logPhyco and logParMC
## t = 11.949, df = 55, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7565981 0.9089839
## sample estimates:
##       cor 
## 0.8496596

The statistical tests show that there are significant correlations between pigments and toxin, but the plots show that there are several dates in which the toxin levels are 0 but pigments are high, indicating that pigments are not always predictive of toxicity.

Do the nearshore and offshore sites have different bloom pigment concentrations?

nutrient %>%
  group_by(Shore) %>%
  summarise(median(Chla))
## # A tibble: 2 x 2
##       Shore median(Chla)
##      <fctr>        <dbl>
## 1 nearshore       16.615
## 2  offshore        5.860
wilcox.test(Chla ~ Shore, data = nutrient, alternative = "greater")
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Chla by Shore
## W = 551, p-value = 0.0006685
## alternative hypothesis: true location shift is greater than 0

How is phycocyanin related to particulate microcystin levels in the early bloom versus the late bloom?

par(mfrow = c(1,2))
################ early bloom #############################

early <- nutrient %>%
  filter(Month %in% c("June", "July", "August")) 

# Fit model
fit_early <- lm(log(ParMC + 0.1) ~ log(Phycocyanin + 0.1), data = early)

# Plot model
plot(x = log(early$ParMC + 0.1), y = log(early$Phycocyanin + 0.1))

# Check residuals to make sure they are normal
hist(fit_early$residuals, xlab = "residuals", main = "")

# summary of model
summary(fit_early)
## 
## Call:
## lm(formula = log(ParMC + 0.1) ~ log(Phycocyanin + 0.1), data = early)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.26150 -0.40679  0.06957  0.36975  1.77811 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -1.59181    0.18230  -8.732 1.76e-09 ***
## log(Phycocyanin + 0.1)  0.94592    0.07619  12.416 6.64e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7367 on 28 degrees of freedom
## Multiple R-squared:  0.8463, Adjusted R-squared:  0.8408 
## F-statistic: 154.2 on 1 and 28 DF,  p-value: 6.641e-13
par(mfrow = c(1,2))
################ late bloom #############################

late <- nutrient %>%
  filter(Month %in% c("September", "October")) 

# Plot model
plot(x = log(late$ParMC + 0.1), y = log(late$Phycocyanin + 0.1))

# Fit model
fit_late <- lm(log(ParMC + 0.1) ~ log(Phycocyanin + 0.1), data = late)

# Check residuals to make sure they are normal
hist(fit_late$residuals, xlab = "residuals", main = "")

# summary of model
summary(fit_late)
## 
## Call:
## lm(formula = log(ParMC + 0.1) ~ log(Phycocyanin + 0.1), data = late)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.69001 -0.31917  0.07973  0.35119  1.07065 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -1.4707     0.1413 -10.411 1.41e-10 ***
## log(Phycocyanin + 0.1)   0.4646     0.0708   6.562 7.11e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.645 on 25 degrees of freedom
## Multiple R-squared:  0.6327, Adjusted R-squared:  0.618 
## F-statistic: 43.06 on 1 and 25 DF,  p-value: 7.114e-07

Based on these two linear models, we see that the slope estimate for phycocyanin is much lower in the late part of the bloom than the early part of the bloom.

Figure 2: Cyano OTUs

In this figure we will display in part A a barplot with the relative abundance of cyanobacteria genera over time and site. In part B we will break up these groups into OTUs and show lineplots for each non-rare OTU over time (mean rel abundance > 0.0001) and site.

## Select only cyano OTUs that have mean relative abundace > 0.0005
n = 15000
thresh = 0.0005

# Calculate mean relative abundance for each OTU
tax_mean <- taxa_sums(erie_scale)/nsamples(erie_scale)

# Prune low abundance taxa using thresh as mean relative abundance
erie_prune_0001 <- 
  erie_scale %>%
    prune_taxa(tax_mean > thresh*n, .)


# Create a melted data frame of selected cyanobacteria OTUs
cyano_otus <- 
  erie_prune_0001 %>%
    transform_sample_counts(function(x) {x/sum(x)}) %>%
    subset_taxa(Class == "Cyanobacteria") %>%
    psmelt() %>%
    order_dates()
################# Plot A #######################

cyano_genus <-
  cyano_otus %>%
    group_by(Genus, Date, Station) %>%
    summarize(Abundance = sum(Abundance)) %>%
    arrange(Genus) %>%
    order_dates()


plot2a <- ggplot(cyano_genus, aes(x = Date, y = Abundance, fill = Genus)) +
  facet_grid(~Station) +
  geom_bar(stat = "identity") +
  ylab("rel. abundance") + 
  scale_x_discrete(
      breaks = c("7/8", "8/4", "9/2", "10/6"),
      labels = c("Jul", "Aug", "Sep", "Oct"),
      drop = FALSE
    ) +
  scale_fill_manual(values = c("#755147", "#74b67f", "#3232ff" , "#753376", "#ff9a00")) +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 9),
    plot.title = element_text(size = 10, face = "bold"),
    strip.background = element_blank()
  ) 

genus_legend <- grab_legend(plot2a)

plot2a <- plot2a + theme(legend.position = "none")
# Function to make a ggplot lineplot of an OTU's relative abundance over time
#
# Args:
#   df: a melted data frame generated by calling psmelt() on a phyloseq object. 
#       Contains an "Abundance" column for the OTU's abundance 
#   otu: the OTU to generate a lineplot for
#   taxrank: the taxonomic rank to appear in the plot title (e.g. "Genus")
# Returns:
#   a ggplot lineplot
plot_otus <- function(df, otu, taxrank) {
  ggplot(df, 
    aes(x = Date, y = Abundance, group = Station, color = Station, shape = Station)) +
    geom_point(size = 2) +
    geom_line(size = 0.7) +
    ggtitle(paste(df[1, taxrank], otu)) +
    ylab("rel. abund") + 
    scale_color_manual(values = station_colors) +
    scale_x_discrete(
      breaks = c("7/8", "8/4", "9/2", "10/6"),
      labels = c("Jul", "Aug", "Sep", "Oct"),
      drop = FALSE
    ) +
    theme(
      axis.title.x = element_blank(),
      axis.title.y = element_text(size = 9),
      legend.position = "none",
      plot.title = element_text(size = 10, face = "bold")
    ) 
}



##################### Plot B #############################

cyano_otu_names <- as.list(levels(cyano_otus$Species))
names(cyano_otu_names) <- levels(cyano_otus$Species)


# Generate a lineplot for each cyanobacteria with mean relative abundance > 0.0001
cyano_otu_plots <- lapply(cyano_otu_names, 
  function(otu) {
    df_otu <- filter(cyano_otus, OTU == otu)
    plot <- plot_otus(df = df_otu, otu = otu, taxrank = "Genus") 
    return(plot)
  }
)

plot2b <- arrangeGrob(
  cyano_otu_plots$Otu00007, cyano_otu_plots$Otu00177, cyano_otu_plots$Otu00005,
  cyano_otu_plots$Otu00044, cyano_otu_plots$Otu00304, cyano_otu_plots$Otu00037, 
  cyano_otu_plots$Otu00147, cyano_otu_plots$Otu00193, cyano_otu_plots$Otu00049,
  ncol = 3, nrow = 3
)
##################### Compile plots #####################################

ggdraw() +
  draw_plot(plot2a, x = 0.03, y = 0.7, width = .8, height = 0.28) +
  draw_plot(genus_legend, x = 0.83, y = .76, width = .16, height = .18) + 
  draw_plot(plot2b, x = 0.03, y = 0.02, width = .8, height = 0.64) +
  draw_plot(station_legend, x = 0.80, y = .5, width = .18, height = .18) + 
  draw_plot_label(c("A", "B"), c(0.02, 0.02), c(0.97, 0.68), size = 14) 

ggsave("../../Plots/Figure2.pdf", plot = bloom_plots, width = 10, height = 8) 

Figure 2 statistics

What is the most abundant genus?

cyano_genus %>%
  group_by(Genus) %>%
  summarise(median(Abundance))
## # A tibble: 5 x 2
##           Genus median(Abundance)
##          <fctr>             <dbl>
## 1      Anabaena       0.000000000
## 2   Microcystis       0.024369748
## 3 Pseudanabaena       0.002794857
## 4 Synechococcus       0.038433382
## 5  unclassified       0.002680188

What if we look at each site separately?

cyano_genus %>%
  group_by(Genus, Station) %>%
  summarise(median(Abundance))
## Source: local data frame [15 x 3]
## Groups: Genus [?]
## 
##            Genus    Station median(Abundance)
##           <fctr>     <fctr>             <dbl>
## 1       Anabaena nearshore1      0.0000000000
## 2       Anabaena nearshore2      0.0000000000
## 3       Anabaena   offshore      0.0000000000
## 4    Microcystis nearshore1      0.0318316878
## 5    Microcystis nearshore2      0.0413849336
## 6    Microcystis   offshore      0.0070832091
## 7  Pseudanabaena nearshore1      0.0026464855
## 8  Pseudanabaena nearshore2      0.0043612088
## 9  Pseudanabaena   offshore      0.0009692812
## 10 Synechococcus nearshore1      0.0327130113
## 11 Synechococcus nearshore2      0.0382854263
## 12 Synechococcus   offshore      0.0476329846
## 13  unclassified nearshore1      0.0042086796
## 14  unclassified nearshore2      0.0045272706
## 15  unclassified   offshore      0.0019813605

Which Cyanobacteria are associated with each other? Spearman’s rho correlations are shown and significant correlations are bolded.

# Create a phyloseq object of only cyanobacteria OTUs
cyano_physeq <- 
  erie_prune_0001 %>%
    transform_sample_counts(function(x) {x/sum(x)}) %>%
    subset_taxa(Class == "Cyanobacteria")

taxa_names(cyano_physeq) <- paste(
  tax_table(cyano_physeq)[ , "Genus"], 
  taxa_names(cyano_physeq)
)

# Run pairwise spearman correlation tests between all cyanobacteria genera.
cyano_corrs_spearman <- corr.test(
  t(otu_table(cyano_physeq)), 
  use = "complete", 
  method = "spearman", 
  adjust = "fdr"
)

emphasize.strong.cells(which(cyano_corrs_spearman$p < 0.05, arr.ind = TRUE))
pander(signif(cyano_corrs_spearman$r, digits = 2))
Table continues below
  Microcystis Otu00005 Synechococcus Otu00007
Microcystis Otu00005 1 -0.063
Synechococcus Otu00007 -0.063 1
Pseudanabaena Otu00037 0.77 0.13
Synechococcus Otu00044 0.22 0.46
unclassified Otu00049 0.63 0.18
Synechococcus Otu00147 -0.23 0.6
Synechococcus Otu00177 0.49 0.088
Anabaena Otu00193 0.56 -0.052
unclassified Otu00304 0.57 0.027
Table continues below
  Pseudanabaena Otu00037 Synechococcus Otu00044
Microcystis Otu00005 0.77 0.22
Synechococcus Otu00007 0.13 0.46
Pseudanabaena Otu00037 1 0.28
Synechococcus Otu00044 0.28 1
unclassified Otu00049 0.68 0.52
Synechococcus Otu00147 0.086 0.31
Synechococcus Otu00177 0.6 0.25
Anabaena Otu00193 0.26 0.092
unclassified Otu00304 0.61 -0.022
Table continues below
  unclassified Otu00049 Synechococcus Otu00147
Microcystis Otu00005 0.63 -0.23
Synechococcus Otu00007 0.18 0.6
Pseudanabaena Otu00037 0.68 0.086
Synechococcus Otu00044 0.52 0.31
unclassified Otu00049 1 -0.16
Synechococcus Otu00147 -0.16 1
Synechococcus Otu00177 0.74 -0.27
Anabaena Otu00193 0.47 -0.51
unclassified Otu00304 0.62 -0.2
Table continues below
  Synechococcus Otu00177 Anabaena Otu00193
Microcystis Otu00005 0.49 0.56
Synechococcus Otu00007 0.088 -0.052
Pseudanabaena Otu00037 0.6 0.26
Synechococcus Otu00044 0.25 0.092
unclassified Otu00049 0.74 0.47
Synechococcus Otu00147 -0.27 -0.51
Synechococcus Otu00177 1 0.35
Anabaena Otu00193 0.35 1
unclassified Otu00304 0.45 0.44
  unclassified Otu00304
Microcystis Otu00005 0.57
Synechococcus Otu00007 0.027
Pseudanabaena Otu00037 0.61
Synechococcus Otu00044 -0.022
unclassified Otu00049 0.62
Synechococcus Otu00147 -0.2
Synechococcus Otu00177 0.45
Anabaena Otu00193 0.44
unclassified Otu00304 1

Figure 3: Alpha diversity

# My own subsetting function, similar to phyloseq::subset_taxa, except taxa can 
# be passed as arguments within functions without weird environment errors
#
# Args:
#   physeq: a phyloseq object
#   taxrank: taxonomic rank to filter on
#   taxa: a vector of taxa groups to filter on
#
# Returns: 
#   a phyloseq object subsetted to the x taxa in taxrank
my_subset_taxa <- function(physeq, taxrank, taxa) {
  physeq_tax_sub <- tax_table(physeq)[tax_table(physeq)[ , taxrank] %in% taxa, ]
  tax_table(physeq) <- physeq_tax_sub
  return(physeq)
}

Here we estimate alpha diversity by sampling with replacement 100x and averaging OTU richness and Simpson’s E over each of the trials

# Initialize parameters
trials = 100
min_lib = min(sample_sums(erie)) # Depth we are rarefying to

# Groups to estimate alpha diversity for 
mytaxa <- c(
  "Bacteria", "NcBacteria", "Actinobacteria", "Alphaproteobacteria",
  "Betaproteobacteria", "Bacteroidetes", "Gammaproteobacteria", 
  "Deltaproteobacteria", "Verrucomicrobia"
)
names(mytaxa) <- mytaxa

# Taxonomic ranks of mytaxa
mytaxa_taxrank <- c(
  "Kingdom", "Class", "Phylum", "Class", "Class", 
  "Phylum", "Class", "Class", "Phylum"
)
names(mytaxa_taxrank) <- mytaxa

# Data frame to hold alpha diversity estimates over trials
alphadiv_df <- data.frame(matrix(nrow = nsamples(erie), ncol = trials))

# Initialize empty df's for richness and evenness of all taxa in mytaxa
richness <- lapply(mytaxa, function(x) {return(alphadiv_df)} )
simpson <- lapply(mytaxa, function(x) {return(alphadiv_df)} )

alphadiv_list <- list(richness = richness, simpson = simpson)


# It is always important to set a seed when you subsample so your result is replicable 
set.seed(3)

# Run trials to subsample and estimate diversity
for (i in 1:trials) {
  # Subsample
  rarefied_physeq <- rarefy_even_depth(erie, sample.size = min_lib, verbose = FALSE, replace = TRUE)
  
  # Generate alpha-diversity estimates for each taxonomic group
  for (t in mytaxa) {
    # Subset physeq object to taxa in mytaxa
    if (t != "NcBacteria") {
      physeq_sub <- my_subset_taxa(
        physeq = rarefied_physeq, 
        taxrank = mytaxa_taxrank[t], 
        taxa = t
      )
    } else {
      physeq_sub <- subset_taxa(physeq = rarefied_physeq, Class != "Cyanobacteria")
    }
    
    # Calculate observed richness for that group and store value in a df column
    richness <- estimate_richness(physeq_sub, measures = "Observed")[ ,1]
    alphadiv_list$richness[[t]][ ,i] <- richness
     
    # Calculate Simpson's E for that group and store value in a df column
    alphadiv_list$simpson[[t]][ ,i] <- (estimate_richness(physeq_sub, measures = "InvSimpson")[ ,1]/richness)

  }
}
# Calculate the means of richness and inverse simpson from the 100 trials
alphadiv_est <- lapply(alphadiv_list, function(div_measure) {
    lapply(div_measure, function(taxa_group) {
        alpha_mean <- rowMeans(taxa_group)
        return(alpha_mean)
    })  
})

# Convert alphadiv_est richness and simpson's E lists into wide data frames
l <- lapply(alphadiv_est, function(x) {
  # convert from list to data.frame
  est_df <- plyr::ldply(.data = x, .fun = data.frame)
  names(est_df) <- c("Taxa", "Diversity")
  
  # Add in SampleID column and spread to wide format
  r <- est_df %>%
    mutate(SampleID = rep(sample_names(erie), length(mytaxa)))
  return(r)
})

# Merge sample metadata with these estimates
merge_dat <- data.frame(sample_data(erie)) %>%
  select(SampleID, logChla, pH, logPhyco, TP, Turbidity, Station, Date, Days) %>%
  mutate(logTP = log(TP)) %>%
  mutate(logTurb = log(Turbidity))

# Create a df with a "Diversity" column that includes richness and inv. simpson,
# and log-chl a values from erie sample_data
alpha_comb <- l$richness %>% 
  left_join(y = l$simpson, by = c("Taxa", "SampleID")) %>%   # Join the richness and inv_simp df's
  rename(Richness = Diversity.x, Simpson = Diversity.y) %>%  # rename columns to avoid confusion
  left_join(merge_dat, by = "SampleID") %>%                  # Join with merged nutrient data
  gather(key = "Alphadiv", value = "Estimate", Richness, Simpson) %>%
  order_dates()
# Function to test whether there is a linear relationship 
# between log chla and alpha diversity of a group
#
# Args:
#   df: a data frame with columns for diversity estimate and bloom variable (e.g. logChla, logPhyco)
#
# Returns:
#   a vector with the pvalue and R2 of the linear model
test_alphadiv_pp <- function(df, bloomvar) {
  
  df_sub <- na.omit(df[, c(bloomvar, "Estimate")])
  
  # Fit a linear model 
  fit <- lm(reformulate(termlabels = bloomvar, response = "Estimate"), data = df_sub)
  
  # Grab model outputs
  fit_pvalue <- summary(fit)$coef[2,4]
  fit_r2 <- summary(fit)$r.squared
  
  return(c(fit_pvalue, fit_r2))
}
# Function to make a ggplot scatterplot of logChla vs an alpha-diversity metric.
# If the pvalue is below 0.05, it will also plot the fitted line
#
# Args:
#   df: a melted data frame with a column called logChla and value for alpha-diversity
#   measure: Alpha-diversity measure (e.g. "InvSimpson" or "Observed")
#   group: Taxonomic group to plot (e.g. "Betaproteobacteria")
#   pvalue: pvalue from linear model returned by test_alphadiv_pp
#   r2: r2 from linear model returned by test_alphadiv_pp
#
# Returns:
#   a ggplot
make_alphadiv_plot <- function(df, bloomvar, measure, group, pvalue, p_x, r2) {
  
  g <- ggplot(df, aes_string(x = bloomvar, y = "Estimate")) + 
    geom_point() +
    ylab(measure) +
    ggtitle(group) 
  
  
  # Since we rounded to 3 sigfigs, estimates of 0 need to actually say "p < 0.001"
  if (pvalue == 0) {
    g <- g + annotate(
      "text", 
      x = p_x,
      y = max(df$Estimate) - 0.03*max(df$Estimate), 
      size = 3, 
      label = "p < 0.001"
    )
  } else if (pvalue < 0.05 & pvalue != 0) {
    g <- g + annotate(
        "text", 
        x = p_x,
        y = max(df$Estimate) - 0.03*max(df$Estimate), 
        size = 3, 
        label = paste("p =", pvalue)
      ) 
  }
  
  if (pvalue < 0.05) {
    g <- g + 
      annotate(
        "text",
        x = p_x,
        y = max(df$Estimate) - 0.08*max(df$Estimate),
        size = 3,
        label = paste("R2 =", r2)
      ) +
      geom_smooth(method = "lm", size = 1)
  }
  
  return(g)
                     
}
divs <- named_list("Richness", "Simpson")

# apply alpha div test to each diversity index for each group
alpha_models <- lapply(divs, function(d) {
  alpha_sub <- alpha_comb %>% filter(Alphadiv == d)
  lapply(mytaxa, function(t) {
    alpha_sub <- alpha_sub %>% filter(Taxa == t)
    # Fit linear model 
    fit <- test_alphadiv_pp(alpha_sub, "logChla")
    return(fit)
  })
})

# Unlist
alpha_results <- lapply(alpha_models, function(x) {
  f <- x %>% 
     unlist(use.names = FALSE) %>%
     matrix(
        nrow = length(mytaxa), 
        ncol = 2, 
        byrow = TRUE, 
        dimnames = list(mytaxa, c("pvalue","r2"))
    )
    
  # fdr correction on pvalues
  f[ ,1] <- p.adjust(f[ ,1], method = "fdr") 
  # Round to three significant digits
  f <- round(f, digits = 3)
})


## Make plots for Simpson's E vs log chla
simp_plots <- list()

for (i in 1:length(mytaxa)) {
  df <- filter(alpha_comb, Taxa == mytaxa[i]) %>%
    filter(Alphadiv == "Simpson") 
  simp_plots[[i]] <- make_alphadiv_plot(
    df = df, 
    bloomvar = "logChla",
    measure = "Simpson's E", 
    group = mytaxa[i],
    pvalue = alpha_results$Simpson[i, 1],
    p_x = 0.5,
    r2 = alpha_results$Simpson[i, 2]
  )
}

# Fit quadratic to alphaproteo evenness
alphaproteo <- filter(alpha_comb, Taxa == "Alphaproteobacteria") %>%
  filter(Alphadiv == "Simpson")
quad_fit <- lm(Estimate ~ logChla + I(logChla^2), data = alphaproteo)
summary(quad_fit)
## 
## Call:
## lm(formula = Estimate ~ logChla + I(logChla^2), data = alphaproteo)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.097202 -0.036048 -0.004634  0.031577  0.141354 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.097811   0.030413   3.216 0.002280 ** 
## logChla       0.101997   0.028571   3.570 0.000800 ***
## I(logChla^2) -0.021940   0.006266  -3.501 0.000984 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05017 on 50 degrees of freedom
## Multiple R-squared:  0.2035, Adjusted R-squared:  0.1716 
## F-statistic: 6.387 on 2 and 50 DF,  p-value: 0.003387
simp_plots[[4]] <- simp_plots[[4]] + 
  stat_smooth(method = "lm", formula = y ~ poly(x, 2)) +
  annotate(
    "text",
    x = 0.9,
    y = .34,
    size = 3,
    label = paste("p < 0.001") 
  ) + 
  annotate(
    "text",
    x = 0.9,
    y = .31,
    size = 3,
    label = paste("R2 =", round(summary(quad_fit)$adj.r.squared, 3))
  ) 


## Make plots for observed richness vs log chla
rich_plots <- list()

for (i in 1:length(mytaxa)) {
  df <- filter(alpha_comb, Taxa == mytaxa[i]) %>%
    filter(Alphadiv == "Richness")
  rich_plots[[i]] <- make_alphadiv_plot(
    df = df, 
    bloomvar = "logChla",
    measure = "Obs. Richness", 
    group = mytaxa[i],
    pvalue = alpha_results$Richness[i, 1],
    p_x = 0.5,
    r2 = alpha_results$Richness[i, 2]
  )
}

Seasonal alpha diversity plots

alphadiv_ncbacteria_simpson <- alpha_comb %>%
  filter(Taxa == "NcBacteria") %>%
  filter(Alphadiv == "Simpson") %>%
  order_dates()

seasonE <- ggplot(alphadiv_ncbacteria_simpson, aes(x = Date, y = Estimate, group = Station, color = Station, shape = Station)) +
  geom_point() +
  geom_line() +
  scale_color_manual(values = station_colors) +
  ylab("Simpson's E") +
  xlab("") +
  scale_x_discrete(
      breaks = c("7/8", "8/4", "9/2", "10/6"),
      labels = c("Jul", "Aug", "Sep", "Oct"),
      drop = FALSE
  ) + 
  theme(legend.position = "none") +
  ggtitle("Nc-Bacteria")

alphadiv_ncbacteria_rich <- alpha_comb %>%
  filter(Taxa == "NcBacteria") %>%
  filter(Alphadiv == "Richness") %>%
  order_dates()
  
seasonRich <- ggplot(alphadiv_ncbacteria_rich, aes(x = Date, y = Estimate, group = Station, color = Station, shape = Station)) +
  geom_point() +
  geom_line() +
  scale_color_manual(values = station_colors) +
  ylab("Obs. Richness") +
  xlab("") +
  scale_x_discrete(
      breaks = c("7/8", "8/4", "9/2", "10/6"),
      labels = c("Jul", "Aug", "Sep", "Oct"),
      drop = FALSE
  ) + 
  theme(legend.position = "none") +
  ggtitle("Nc-Bacteria")
## Arrange plots for final figure
ggdraw() +
  draw_plot(seasonRich,      x = 0.01,  y = 0.73, width = 0.48, height = 0.24) + 
  draw_plot(rich_plots[[4]], x = 0.01,  y = 0.49, width = 0.48, height = 0.24) +  
  draw_plot(rich_plots[[5]], x = 0.01,  y = 0.25, width = 0.48, height = 0.24) +
  draw_plot(rich_plots[[6]], x = 0.01,  y = 0.01, width = 0.48, height = 0.24) +
  draw_plot(seasonE,         x = 0.52,  y = 0.73, width = 0.48, height = 0.24) + 
  draw_plot(simp_plots[[4]], x = 0.52,  y = 0.49, width = 0.48, height = 0.24) +  
  draw_plot(simp_plots[[5]], x = 0.52,  y = 0.25, width = 0.48, height = 0.24) +
  draw_plot(simp_plots[[6]], x = 0.52,  y = 0.01, width = 0.48, height = 0.24) +
  draw_plot_label(c("A", "B", "C", "D", "E", "F", "G", "H"),  c(0, 0, 0, 0, .52, .52, .52, .52), c(0.99, 0.74, 0.5, 0.26), size = 14)

ggsave("../../Plots/Figure3.pdf", height = 10, width = 7)

Other bloom measurements

cor.test(nutrient$logChla, nutrient$pH)
## 
##  Pearson's product-moment correlation
## 
## data:  nutrient$logChla and nutrient$pH
## t = 9.149, df = 52, p-value = 2.044e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6554519 0.8701501
## sample estimates:
##       cor 
## 0.7853757
######## pH ###########

# apply alpha div test to each diversity index for each group
alpha_models <- lapply(divs, function(d) {
  alpha_sub <- alpha_comb %>% filter(Alphadiv == d)
  lapply(mytaxa, function(t) {
    alpha_sub <- alpha_sub %>% filter(Taxa == t)
    # Fit linear model 
    fit <- test_alphadiv_pp(alpha_sub, "pH")
    return(fit)
  })
})

# Unlist
alpha_results <- lapply(alpha_models, function(x) {
  f <- x %>% 
     unlist(use.names = FALSE) %>%
     matrix(
        nrow = length(mytaxa), 
        ncol = 2, 
        byrow = TRUE, 
        dimnames = list(mytaxa, c("pvalue","r2"))
    )
    
  # fdr correction on pvalues
  f[ ,1] <- p.adjust(f[ ,1], method = "fdr") 
  # Round to three significant digits
  f <- round(f, digits = 3)
})

simp_plots_ph <- list()

for (i in 1:length(mytaxa)) {
  df <- filter(alpha_comb, Taxa == mytaxa[i]) %>%
    filter(Alphadiv == "Simpson") 
  simp_plots_ph[[i]] <- make_alphadiv_plot(
    df = df, 
    bloomvar = "pH",
    measure = "Simpson's E", 
    group = mytaxa[i],
    pvalue = alpha_results$Simpson[i, 1],
    p_x = 8.1,
    r2 = alpha_results$Simpson[i, 2]
  )
}

# Fit quadratic to alphaproteo evenness
alphaproteo <- filter(alpha_comb, Taxa == "Alphaproteobacteria") %>%
  filter(Alphadiv == "Simpson")
quad_fit <- lm(Estimate ~ pH + I(pH^2), data = alphaproteo)
summary(quad_fit)
## 
## Call:
## lm(formula = Estimate ~ pH + I(pH^2), data = alphaproteo)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.125636 -0.031979 -0.003435  0.029431  0.132886 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -12.55867    3.55752  -3.530 0.000941 ***
## pH            3.01789    0.83005   3.636 0.000686 ***
## I(pH^2)      -0.17817    0.04835  -3.685 0.000591 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04824 on 47 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.2789, Adjusted R-squared:  0.2482 
## F-statistic: 9.087 on 2 and 47 DF,  p-value: 0.0004607
simp_plots_ph[[4]] <- simp_plots_ph[[4]] + 
  stat_smooth(method = "lm", formula = y ~ poly(x, 2)) +
  annotate(
    "text",
    x = 8.1,
    y = .34,
    size = 3,
    label = paste("p < 0.001") 
  ) + 
  annotate(
    "text",
    x =  8.1,
    y = .31,
    size = 3,
    label = paste("R2 =", round(summary(quad_fit)$adj.r.squared, 3))
  ) 
grid.arrange(simp_plots_ph[[4]], simp_plots_ph[[5]], simp_plots_ph[[6]])

############# Phycocyanin #############
# apply alpha div test to each diversity index for each group
alpha_models <- lapply(divs, function(d) {
  alpha_sub <- alpha_comb %>% filter(Alphadiv == d)
  lapply(mytaxa, function(t) {
    alpha_sub <- alpha_sub %>% filter(Taxa == t)
    # Fit linear model 
    fit <- test_alphadiv_pp(alpha_sub, "logPhyco")
    return(fit)
  })
})

# Unlist
alpha_results <- lapply(alpha_models, function(x) {
  f <- x %>% 
     unlist(use.names = FALSE) %>%
     matrix(
        nrow = length(mytaxa), 
        ncol = 2, 
        byrow = TRUE, 
        dimnames = list(mytaxa, c("pvalue","r2"))
    )
    
  # fdr correction on pvalues
  f[ ,1] <- p.adjust(f[ ,1], method = "fdr") 
  # Round to three significant digits
  f <- round(f, digits = 3)
})

simp_plots_phyco <- list()

for (i in 1:length(mytaxa)) {
  df <- filter(alpha_comb, Taxa == mytaxa[i]) %>%
    filter(Alphadiv == "Simpson") 
  simp_plots_phyco[[i]] <- make_alphadiv_plot(
    df = df, 
    bloomvar = "logPhyco",
    measure = "Simpson's E", 
    group = mytaxa[i],
    pvalue = alpha_results$Simpson[i, 1],
    p_x = -1.5,
    r2 = alpha_results$Simpson[i, 2]
  )
}

# Fit quadratic to alphaproteo evenness
alphaproteo <- filter(alpha_comb, Taxa == "Alphaproteobacteria") %>%
  filter(Alphadiv == "Simpson")
quad_fit <- lm(Estimate ~ logPhyco + I(logPhyco^2), data = alphaproteo)
summary(quad_fit)
## 
## Call:
## lm(formula = Estimate ~ logPhyco + I(logPhyco^2), data = alphaproteo)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.116487 -0.036608 -0.009642  0.033543  0.136508 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.209797   0.009189  22.831   <2e-16 ***
## logPhyco       0.005633   0.007166   0.786   0.4355    
## I(logPhyco^2) -0.004930   0.002284  -2.158   0.0357 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05253 on 50 degrees of freedom
## Multiple R-squared:  0.1268, Adjusted R-squared:  0.09183 
## F-statistic: 3.629 on 2 and 50 DF,  p-value: 0.03375
simp_plots_phyco[[4]] <- simp_plots_phyco[[4]] + 
  stat_smooth(method = "lm", formula = y ~ poly(x, 2)) +
  annotate(
    "text",
    x = -1.5,
    y = .34,
    size = 3,
    label = paste("p = 0.035") 
  ) + 
  annotate(
    "text",
    x =  -1.5,
    y = .31,
    size = 3,
    label = paste("R2 =", round(summary(quad_fit)$adj.r.squared, 3))
  ) 

grid.arrange(simp_plots_phyco[[4]], simp_plots_phyco[[5]], simp_plots_phyco[[6]])

Figure 4: Composition analyses

# Subset to non-cyanobacteria and scale internally
ncbacteria <- 
  erie %>%
    subset_taxa(Class != "Cyanobacteria") %>%
    scale_reads(round = "round")


# Generate pcoa scores for each subset
ncbact_pcoa <- ordinate(
      physeq = ncbacteria,
      method = "PCoA",
      distance = "bray"
)

# Generate a df to plot pcoa for each subset
pcoa_df <- plot_ordination(
  physeq = ncbacteria,
  axes = 1:3,
  ordination = ncbact_pcoa,
  justDF = TRUE
)

pcoa_df <- pcoa_df %>% 
  rename(PC1 = Axis.1, PC2 = Axis.2, PC3 = Axis.3) %>%
  order_dates() %>%
  # Flip orientation of PC2 for Cyanobacteria (does not affect interpretation)
  mutate(PC2 = -PC2) 


# Generate relative, lingoes-corrected eigenvalues for PC1 and PC2
pcs <- c(
  PC1 = signif(ncbact_pcoa$values$Rel_corr_eig[1]*100, 3),
  PC2 = signif(ncbact_pcoa$values$Rel_corr_eig[2]*100, 3),
  PC3 = signif(ncbact_pcoa$values$Rel_corr_eig[3]*100, 3)
)
# Function to create a plot of time (x-axis) vs PC scores (y-axis)
plot_pcts <- function(df, pc, eigs) {
  ggplot(df, 
    aes_string(x = "Date", y = pc, group = "Station", color = "Station", shape = "Station")) +
      geom_point(size = 2.5) +
      geom_line(size = 1.1) + 
      scale_color_manual(values = station_colors) +
      scale_x_discrete(
        breaks = c("7/8", "8/4", "9/2", "10/6"),
        labels = c("Jul", "Aug", "Sep", "Oct"),
        drop = FALSE
      ) +  
      ylab(paste(pc, " ", eigs[pc], "%")) +
      theme(
        axis.title.x = element_blank(),
        plot.title = element_text(face = "bold", size = 16),
        legend.position = "none",
        plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), "cm")
      ) 
}

# Non-cyano PC time series plots
ncbact_pcs <- named_list("PC1", "PC2", "PC3")

ncbact_pc_plots <- lapply(ncbact_pcs,
  function(x) {
    plot_pcts(pcoa_df, x, eigs = pcs)
  }
)
# Plot for pH
ph_plot <- ggplot(pcoa_df, aes(x = PC1, y = pH)) +
  geom_point(size = 2.5, aes(shape = Station, color = Station)) +
  scale_color_manual(values = station_colors) +
  geom_smooth(method = "lm", color = "black") +
  theme(legend.position = "none")

# Plot for Temperature
temp_plot <- ggplot(pcoa_df, aes(x = PC2, y = Temp)) +
  geom_point(size = 2.5, aes(shape = Station, color = Station)) +
  scale_color_manual(values = station_colors) +
  geom_smooth(method = "lm", color = "black") +
  theme(legend.position = "none") +
  ylab("Temperature")

# Plot for SpCond
spcond_plot <- ggplot(pcoa_df, aes(x = PC3, y = SpCond)) +
  geom_point(size = 2.5, aes(shape = Station, color = Station)) +
  scale_color_manual(values = station_colors) +
  geom_smooth(method = "lm", color = "black") +
  theme(legend.position = "none") +
  ylab("Specific Conductivity")

Compiled plot

ggdraw() +
  ## non-cyano
  draw_plot(ncbact_pc_plots$PC1, x = 0.02, y = 0.5, width = 0.3, height = 0.42) +
  draw_plot(ncbact_pc_plots$PC2, x = 0.34, y = 0.5, width = 0.3, height = 0.42) +
  draw_plot(ncbact_pc_plots$PC3, x = 0.66, y = 0.5, width = 0.3, height = 0.42) +
  draw_plot(ph_plot, x = 0.02, y = 0.0, width = 0.3, height = 0.42) +
  draw_plot(temp_plot, x = 0.34, y = 0.0, width = 0.3, height = 0.42) +
  draw_plot(spcond_plot, x = 0.66, y = 0.0, width = 0.3, height = 0.42) 

Figure 4 Statistics

How many PCoA dimensions should we examine?

plot(1:nrow(ncbact_pcoa$values), ncbact_pcoa$values$Rel_corr_eig, xlab = 
       "Principal Coordinate", ylab = "Relative eigenvalue")

How good is my PCoA in three dimensions?

pcoa_dist <- as.vector(dist(ncbact_pcoa$vectors[,1:3]))
bray_dist <- as.vector(phyloseq::distance(ncbacteria, method = "bray"))

plot(bray_dist, pcoa_dist) +
 abline(lm(pcoa_dist~bray_dist), col = "blue")

## numeric(0)
summary(lm(pcoa_dist~bray_dist))
## 
## Call:
## lm(formula = pcoa_dist ~ bray_dist)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.307054 -0.027616  0.007518  0.036637  0.124308 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.248702   0.005806  -42.83   <2e-16 ***
## bray_dist    1.237353   0.011810  104.78   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05377 on 1376 degrees of freedom
## Multiple R-squared:  0.8886, Adjusted R-squared:  0.8885 
## F-statistic: 1.098e+04 on 1 and 1376 DF,  p-value: < 2.2e-16

What is the max bray-curtis between time points for each site?

# Histogram of bray-curtis distance
hist(bray_dist)

# Nearshore 1 
n1 <- subset_samples(erie_scale, Station == "nearshore1")
n1_bdist <- phyloseq::distance(n1, method = "bray")
max(as.matrix(n1_bdist)[1,])
## [1] 0.783699
# Nearshore 2
n2 <- subset_samples(erie_scale, Station == "nearshore2")
n2_bdist <- phyloseq::distance(n2, method = "bray")
max(as.matrix(n2_bdist)[1,])
## [1] 0.8121858
# Offshore 
o <- subset_samples(erie_scale, Station == "offshore")
o_bdist <- phyloseq::distance(o, method = "bray")
max(as.matrix(o_bdist)[1,])
## [1] 0.6426857

What is the bray-curtis between the first and last time points

# Nearshore 1 
n1 <- subset_samples(erie_scale, Station == "nearshore1" & Date %in% c("6/16", "10/27"))
n1_bdist <- phyloseq::distance(n1, method = "bray")
n1_bdist
##           E0012_CNA
## E0132_CNA 0.4601162
# Nearshore 2
n2 <- subset_samples(erie_scale, Station == "nearshore2" & Date %in% c("6/16", "10/27"))
n2_bdist <- phyloseq::distance(n2, method = "bray")
n2_bdist
##           E0010_CNA
## E0130_CNA 0.4530431
# Offshore 
o <- subset_samples(erie_scale, Station == "offshore" & Date %in% c("6/16", "10/27"))
o_bdist <- phyloseq::distance(o, method = "bray")
o_bdist
##           E0014_CNA
## E0134_CNA 0.3639581

Linear models

# Variables to include in cyano models
cyano_vars <- c("Nitrate", "SRP", "Temp", "H2O2", "SpCond", "Ammonia", "Turbidity", "TP", "Days")


# Examine distributions of potential env variables to normalize
par(mfrow = c(3,3))
for (var in cyano_vars) {
  hist(nutrient[ ,var], main = "", xlab = var, ylab = "" )
}

# Adjusted variables to include in cyano models
cyano_vars <- c("sqrt(Nitrate)", "log(SRP)", "Temp", "log(H2O2)", "SpCond", "log(Ammonia)", "log(Turbidity)", "log(TP)", "Days")


# Variables to include in nc-bacteria models
non_cyano_vars <- c(cyano_vars, "pH",  "logParMC", "logChla", "logPhyco")


# Impute SpCond values for nearshore 1 on Sep 2 and Sep 8 with value for nearshore 2

# Change 9/2 value
pcoa_df$SpCond[pcoa_df$Date == "9/2" & pcoa_df$Station == "nearshore1"] <- 
  pcoa_df$SpCond[pcoa_df$Date == "9/2" & pcoa_df$Station == "nearshore2"]
# Change 9/8 value
pcoa_df$SpCond[pcoa_df$Date == "9/8" & pcoa_df$Station == "nearshore1"] <- 
  pcoa_df$SpCond[pcoa_df$Date == "9/8" & pcoa_df$Station == "nearshore2"]

Non-cyano models

get_models <- function(vars, response, dat) {
  models <- list()
  for (var in vars) {
    models[[var]] <- lm(reformulate(termlabels = var, response = response), data = dat)
  }
  return(models)
}
ncbact_pcs <- named_list("PC1", "PC2", "PC3")

# Get the simple linear models for each variable along each PC
non_cyano_models <- lapply(ncbact_pcs, function(x) {
   get_models(non_cyano_vars, x, dat = pcoa_df)
})

# Get rsquared for each model
rsquared <- lapply(non_cyano_models, function(x) {
  lapply(x, function(y) {
    r2 <- summary(y)$r.squared
    return(r2)
  })
})

rsquared_df <- rbind(data.frame(rsquared$PC1), data.frame(rsquared$PC2), data.frame(rsquared$PC3))
rsquared_df$PC <- c("PC1", "PC2", "PC3")

rsquared_df
##   sqrt.Nitrate.  log.SRP.       Temp   log.H2O2.       SpCond log.Ammonia.
## 1    0.34015898 0.1167513 0.01044875 0.006574739 7.410362e-05  0.152346552
## 2    0.21463152 0.4557471 0.82567786 0.158273776 7.386702e-02  0.001826483
## 3    0.05033012 0.1076665 0.02063832 0.017335249 4.510982e-01  0.004945428
##   log.Turbidity.    log.TP.         Days         pH   logParMC    logChla
## 1     0.50543645 0.47435006 0.0556560685 0.39784576 0.30182961 0.43430566
## 2     0.01875346 0.02587116 0.7426635978 0.25433814 0.04398430 0.19320660
## 3     0.12044041 0.16451430 0.0008503211 0.07749959 0.02609096 0.04661143
##      logPhyco  PC
## 1 0.425056608 PC1
## 2 0.059674067 PC2
## 3 0.001428595 PC3
# Get rsquared for each model
pvalue <- lapply(non_cyano_models, function(x) {
  lapply(x, function(y) {
    p <- summary(y)$coefficients[2,4]
    return(p)
  })
})

pvalue_df <- rbind(data.frame(pvalue$PC1), data.frame(pvalue$PC2), data.frame(pvalue$PC3))
pvalue_df$PC <- c("PC1", "PC2", "PC3")

pvalue_df
##   sqrt.Nitrate.     log.SRP.         Temp   log.H2O2.       SpCond
## 1  4.583466e-06 1.227683e-02 4.664099e-01 0.563816708 9.512187e-01
## 2  4.768864e-04 2.953102e-08 5.500437e-21 0.003177153 4.899057e-02
## 3  1.063189e-01 1.645540e-02 3.047689e-01 0.347336859 3.685796e-08
##   log.Ammonia. log.Turbidity.      log.TP.         Days           pH
## 1   0.00386036   2.449999e-09 1.194196e-08 8.901190e-02 9.117569e-07
## 2   0.76124022   3.281703e-01 2.499139e-01 1.190187e-16 1.882576e-04
## 3   0.61680975   1.089803e-02 2.585948e-03 8.357976e-01 5.027727e-02
##       logParMC      logChla     logPhyco  PC
## 1 2.041148e-05 8.088317e-08 1.235285e-07 PC1
## 2 1.317525e-01 9.910694e-04 7.793199e-02 PC2
## 3 2.478855e-01 1.205073e-01 7.881612e-01 PC3

Best models:

pc1_ph <- lm(PC1 ~ pH + Days, data = pcoa_df)
summary(pc1_ph)
## 
## Call:
## lm(formula = PC1 ~ pH + Days, data = pcoa_df)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.294635 -0.070383  0.000344  0.085555  0.244193 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.1660354  0.4454814  -9.352 2.66e-12 ***
## pH           0.4593039  0.0508065   9.040 7.49e-12 ***
## Days         0.0031160  0.0004867   6.402 6.62e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1265 on 47 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.6783, Adjusted R-squared:  0.6646 
## F-statistic: 49.55 on 2 and 47 DF,  p-value: 2.657e-12
pc1_turb <- lm(PC1 ~ Turbidity + Days, data = pcoa_df)
summary(pc1_turb)
## 
## Call:
## lm(formula = PC1 ~ Turbidity + Days, data = pcoa_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.31893 -0.08260 -0.01263  0.06917  0.37056 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.2316885  0.0551331  -4.202 0.000109 ***
## Turbidity    0.0186560  0.0027340   6.824 1.13e-08 ***
## Days         0.0006826  0.0005381   1.269 0.210477    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.155 on 50 degrees of freedom
## Multiple R-squared:  0.511,  Adjusted R-squared:  0.4915 
## F-statistic: 26.13 on 2 and 50 DF,  p-value: 1.706e-08
pc2 <- lm(PC2 ~ Temp, data = pcoa_df)
summary(pc2)
## 
## Call:
## lm(formula = PC2 ~ Temp, data = pcoa_df)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.149786 -0.019076  0.007641  0.030056  0.113835 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.546834   0.035881  -15.24   <2e-16 ***
## Temp         0.026864   0.001728   15.54   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05125 on 51 degrees of freedom
## Multiple R-squared:  0.8257, Adjusted R-squared:  0.8223 
## F-statistic: 241.6 on 1 and 51 DF,  p-value: < 2.2e-16
pc3 <- lm(PC3 ~ SpCond + Days, data = pcoa_df)
summary(pc3)
## 
## Call:
## lm(formula = PC3 ~ SpCond + Days, data = pcoa_df)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.145381 -0.057191 -0.002854  0.057312  0.116025 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.8641631  0.1269292  -6.808 1.20e-08 ***
## SpCond       0.0029764  0.0004268   6.973 6.61e-09 ***
## Days         0.0005084  0.0002486   2.045   0.0462 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06842 on 50 degrees of freedom
## Multiple R-squared:  0.4935, Adjusted R-squared:  0.4732 
## F-statistic: 24.35 on 2 and 50 DF,  p-value: 4.125e-08

Distribution of model residuals:

hist(residuals(pc1_ph))

hist(residuals(pc2))

hist(residuals(pc3))

All distributions look normal enough.

Temporal autocorrelation of model residuals:

par(mfrow = c(3, 3))
# Look at temporal autocorrelation
n1 <- which(pcoa_df$Station == "nearshore1")
n2 <- which(pcoa_df$Station == "nearshore2")
o <- which(pcoa_df$Station == "offshore")

n1_ph <- which(!is.na(pcoa_df$pH) & pcoa_df$Station == "nearshore1")
n2_ph <- which(!is.na(pcoa_df$pH) & pcoa_df$Station == "nearshore2")
o_ph <- which(!is.na(pcoa_df$pH) & pcoa_df$Station == "offshore")


acf(residuals(pc1_ph)[n1_ph], main = "PC1 ~ pH; nearshore 1")
acf(residuals(pc1_ph)[n2_ph], main = "PC1 ~ pH; nearshore 2")
acf(residuals(pc1_ph)[o_ph], main = "PC1 ~ pH; offshore")


acf(residuals(pc2)[n1], main = "PC2 ~ Temp; nearshore 1")
acf(residuals(pc2)[n2], main = "PC2 ~ Temp; nearshore 2")
acf(residuals(pc2)[o], main = "PC2 ~ Temp; offshore")


acf(residuals(pc3)[n1], main = "PC3 ~ SpCond; nearshore 1")
acf(residuals(pc3)[n2], main = "PC3 ~ SpCond; nearshore 2")
acf(residuals(pc3)[o], main = "PC3 ~ SpCond; offshore")

There is some temporal autocorrelation in the residuals so lets look at the cv error.

par(mfrow = c(3, 3))


pacf(residuals(pc1_ph)[n1_ph], main = "PC1 ~ pH; nearshore 1")
pacf(residuals(pc1_ph)[n2_ph], main = "PC1 ~ pH; nearshore 2")
pacf(residuals(pc1_ph)[o_ph], main = "PC1 ~ pH; offshore")


pacf(residuals(pc2)[n1], main = "PC2 ~ Temp; nearshore 1")
pacf(residuals(pc2)[n2], main = "PC2 ~ Temp; nearshore 2")
pacf(residuals(pc2)[o], main = "PC2 ~ Temp; offshore")


pacf(residuals(pc3)[n1], main = "PC3 ~ SpCond; nearshore 1")
pacf(residuals(pc3)[n2], main = "PC3 ~ SpCond; nearshore 2")
pacf(residuals(pc3)[o], main = "PC3 ~ SpCond; offshore")

Cross validation error, leaving out one time point at a time:

dates <- levels(pcoa_df$Date)
loocv <- rep(0, length(dates))
names(loocv) <- dates

# PC1
for (date in dates) {
  train <- filter(pcoa_df, Date != date)
  test <- filter(pcoa_df, Date == date)
  fit <- lm(PC1 ~ pH + Days, data = train)
  pred <- predict(fit, test)
  loocv[date] <- mean((test$PC1 - pred)^2)
}
mean(loocv[-18])
## [1] 0.01928425
# PC2
for (date in dates) {
  train <- filter(pcoa_df, Date != date)
  test <- filter(pcoa_df, Date == date)
  fit <- lm(PC2 ~  Temp, data = train)
  pred <- predict(fit, test)
  loocv[date] <- mean((test$PC1 - pred)^2)
}
mean(loocv)
## [1] 0.05223402
# PC3
for (date in pcoa_df$Date) {
  train <- filter(pcoa_df, Date != date)
  test <- filter(pcoa_df, Date == date)
  fit <- lm(PC3 ~ SpCond + Days, data = train)
  pred <- predict(fit, test)
  loocv[date] <- mean((test$PC1 - pred)^2)
}
mean(loocv)
## [1] 0.04727986

Microcystis and Cyanobacterial associates

library(psych)

n = min(sample_sums(erie))
thresh = 0.001

# Calculate mean relative abundance for each OTU
tax_mean <- taxa_sums(ncbacteria)/nsamples(ncbacteria)

# Prune low abundance taxa using thresh as mean relative abundance
nc_prune <- 
  ncbacteria %>%
    prune_taxa(tax_mean > thresh*n, .)

mc <- 
  erie_scale %>%
  subset_taxa(Species == "Otu00005")

# Calculate correlation between microcystis and nc-bacteria
mc_corrs <-  corr.test(
    x = t(otu_table(mc)), 
    y = t(otu_table(nc_prune)),
    method = "spearman",
    use = "complete",
    adjust = "fdr"
)

which_sigs <- which(mc_corrs$p < 0.05)  
sig_otus <- colnames(mc_corrs$p)[which_sigs]
mc_corrs_dat <- data.frame(tax_table(nc_prune)) %>% filter(Species %in% sig_otus)
mc_corrs_dat$Rho <- mc_corrs$r[which_sigs]

mc_corrs_dat
##     Kingdom          Phylum               Class              Order
## 1  Bacteria  Proteobacteria  Betaproteobacteria    Burkholderiales
## 2  Bacteria  Actinobacteria      Actinobacteria    Actinomycetales
## 3  Bacteria   Bacteroidetes     Sphingobacteria Sphingobacteriales
## 4  Bacteria  Actinobacteria      Actinobacteria    Actinomycetales
## 5  Bacteria  Actinobacteria      Actinobacteria    Actinomycetales
## 6  Bacteria  Proteobacteria Alphaproteobacteria   Rhodospirillales
## 7  Bacteria  Planctomycetes    Planctomycetacia   Planctomycetales
## 8  Bacteria  Actinobacteria      Actinobacteria    Actinomycetales
## 9  Bacteria   Bacteroidetes       Flavobacteria   Flavobacteriales
## 10 Bacteria   Bacteroidetes       Flavobacteria   Flavobacteriales
## 11 Bacteria   Bacteroidetes          Cytophagia       Cytophagales
## 12 Bacteria Verrucomicrobia    OPB35_soil_group       unclassified
## 13 Bacteria   Bacteroidetes       Flavobacteria   Flavobacteriales
## 14 Bacteria  Planctomycetes    Planctomycetacia   Planctomycetales
## 15 Bacteria  Proteobacteria Alphaproteobacteria    Rhodobacterales
## 16 Bacteria  Actinobacteria      Actinobacteria    Actinomycetales
## 17 Bacteria  Actinobacteria      Actinobacteria    Actinomycetales
## 18 Bacteria  Proteobacteria  Betaproteobacteria    Burkholderiales
## 19 Bacteria   Bacteroidetes    Sphingobacteriia Sphingobacteriales
## 20 Bacteria   Bacteroidetes     Sphingobacteria Sphingobacteriales
## 21 Bacteria   Bacteroidetes     Sphingobacteria Sphingobacteriales
## 22 Bacteria   Bacteroidetes     Sphingobacteria Sphingobacteriales
## 23 Bacteria  Planctomycetes    Planctomycetacia   Planctomycetales
## 24 Bacteria Verrucomicrobia      Spartobacteria  Spartobacteriales
## 25 Bacteria   Bacteroidetes       Flavobacteria   Flavobacteriales
## 26 Bacteria   Bacteroidetes       Flavobacteria   Flavobacteriales
## 27 Bacteria  Planctomycetes    Planctomycetacia   Planctomycetales
## 28 Bacteria Armatimonadetes       Armatimonadia    Armatimonadales
## 29 Bacteria  Proteobacteria Alphaproteobacteria    Caulobacterales
## 30 Bacteria     Chloroflexi        Chloroflexia     Chloroflexales
## 31 Bacteria Verrucomicrobia            Opitutae         Opitutales
## 32 Bacteria Verrucomicrobia    OPB35_soil_group       unclassified
## 33 Bacteria   Bacteroidetes       Flavobacteria   Flavobacteriales
## 34 Bacteria  Proteobacteria Gammaproteobacteria    Alteromonadales
## 35 Bacteria Verrucomicrobia            Opitutae         Opitutales
## 36 Bacteria   Bacteroidetes     Sphingobacteria Sphingobacteriales
## 37 Bacteria  Planctomycetes    Planctomycetacia   Planctomycetales
## 38 Bacteria  Actinobacteria      Actinobacteria   Acidimicrobiales
## 39 Bacteria  Proteobacteria Alphaproteobacteria    Rhodobacterales
## 40 Bacteria   Bacteroidetes    Sphingobacteriia Sphingobacteriales
## 41 Bacteria Verrucomicrobia    Verrucomicrobiae Verrucomicrobiales
## 42 Bacteria        Chlorobi           Chlorobia       Chlorobiales
## 43 Bacteria  Actinobacteria      Actinobacteria   Acidimicrobiales
## 44 Bacteria   Bacteroidetes     Sphingobacteria Sphingobacteriales
## 45 Bacteria  Proteobacteria Deltaproteobacteria       Myxococcales
## 46 Bacteria   Bacteroidetes     Sphingobacteria Sphingobacteriales
## 47 Bacteria  Planctomycetes    Planctomycetacia   Planctomycetales
## 48 Bacteria   Bacteroidetes     Sphingobacteria Sphingobacteriales
## 49 Bacteria  Actinobacteria      Actinobacteria   Acidimicrobiales
## 50 Bacteria   Bacteroidetes    Sphingobacteriia Sphingobacteriales
## 51 Bacteria Verrucomicrobia      Spartobacteria  Spartobacteriales
## 52 Bacteria  Proteobacteria  Betaproteobacteria               betV
##                  Family                       Genus        Rank7
## 1                  betI                      betI-A      Lhab-A1
## 2                   acI                       acI-C       acI-C2
## 3                  bacI                      bacI-A      bacI-A1
## 4                   acI                       acI-A unclassified
## 5                   acI                       acI-A       acI-A6
## 6      Acetobacteraceae                  Roseomonas         <NA>
## 7     Planctomycetaceae                unclassified         <NA>
## 8                   acI                       acI-A       acI-A3
## 9                  bacV                unclassified unclassified
## 10                 bacV                unclassified unclassified
## 11        Cytophagaceae                unclassified unclassified
## 12         unclassified                unclassified         <NA>
## 13                 bacV                unclassified unclassified
## 14    Planctomycetaceae                   Pirellula         <NA>
## 15     Rhodobacteraceae                 Rhodobacter         <NA>
## 16                  acI                       acI-C       acI-C2
## 17                Luna1                     Luna1-A     Luna1-A4
## 18       Comamonadaceae                unclassified unclassified
## 19           env.OPS_17                unclassified unclassified
## 20                 bacI                unclassified unclassified
## 21               bacIII                    bacIII-A unclassified
## 22                 bacI                unclassified unclassified
## 23    Planctomycetaceae                Planctomyces         <NA>
## 24   Spartobacteriaceae CandidatusXiphinematobacter       verI-A
## 25                bacII                     bacII-A     Flavo-A1
## 26                bacII                     bacII-A unclassified
## 27    Planctomycetaceae                Planctomyces         <NA>
## 28     Armatimonadaceae                 Armatimonas unclassified
## 29     Caulobacteraceae            Phenylobacterium         <NA>
## 30       Roseiflexaceae                 Roseiflexus         <NA>
## 31          Opitutaceae                unclassified unclassified
## 32         unclassified                unclassified         <NA>
## 33                 bacV                unclassified unclassified
## 34     Alteromonadaceae                 BD1-7_clade         <NA>
## 35          Opitutaceae                unclassified unclassified
## 36                bacVI                unclassified unclassified
## 37    Planctomycetaceae                   Pirellula         <NA>
## 38                 acIV                      acIV-D        Iamia
## 39     Rhodobacteraceae                unclassified         <NA>
## 40 NS11-12_marine_group                unclassified unclassified
## 41  Verrucomicrobiaceae                unclassified         <NA>
## 42                OPB56                unclassified         <NA>
## 43                 acIV                unclassified unclassified
## 44                bacIV                     bacIV-B        Aquir
## 45            0319-6G20                unclassified         <NA>
## 46                bacVI                unclassified unclassified
## 47    Planctomycetaceae             Blastopirellula         <NA>
## 48                 bacI                unclassified unclassified
## 49                 acIV                      acIV-C unclassified
## 50 NS11-12_marine_group                unclassified         <NA>
## 51   Spartobacteriaceae CandidatusXiphinematobacter unclassified
## 52               betV-A                     betV-A1 unclassified
##           Rank8  Species        Rho
## 1  unclassified Otu00002 -0.3972260
## 2  unclassified Otu00006  0.6541005
## 3  unclassified Otu00008 -0.4389629
## 4  unclassified Otu00010 -0.4214664
## 5  unclassified Otu00011 -0.4918456
## 6          <NA> Otu00012  0.6951876
## 7          <NA> Otu00014  0.6052109
## 8  unclassified Otu00016 -0.4071931
## 9  unclassified Otu00017 -0.4733574
## 10 unclassified Otu00023 -0.3285210
## 11 unclassified Otu00024  0.7050750
## 12         <NA> Otu00025 -0.3257464
## 13 unclassified Otu00028 -0.3204986
## 14         <NA> Otu00029  0.5718519
## 15         <NA> Otu00030  0.8631638
## 16 unclassified Otu00032  0.6418170
## 17 unclassified Otu00036 -0.5039319
## 18 unclassified Otu00041  0.6678786
## 19 unclassified Otu00043  0.3989192
## 20 unclassified Otu00047 -0.6929731
## 21 unclassified Otu00048 -0.5743878
## 22 unclassified Otu00052  0.6502580
## 23         <NA> Otu00054  0.5176195
## 24 unclassified Otu00056  0.4261430
## 25 unclassified Otu00057  0.4464391
## 26 unclassified Otu00070 -0.6599055
## 27         <NA> Otu00073  0.6197799
## 28 unclassified Otu00077 -0.6003231
## 29         <NA> Otu00078  0.8491539
## 30         <NA> Otu00081  0.5789068
## 31 unclassified Otu00084 -0.4491541
## 32         <NA> Otu00092  0.4823769
## 33 unclassified Otu00094 -0.3946931
## 34         <NA> Otu00095  0.3467692
## 35 unclassified Otu00099 -0.3435773
## 36 unclassified Otu00113 -0.4906489
## 37         <NA> Otu00115  0.5510557
## 38 unclassified Otu00123  0.3381007
## 39         <NA> Otu00142 -0.4395983
## 40 unclassified Otu00144  0.3723329
## 41         <NA> Otu00153  0.5679166
## 42         <NA> Otu00159  0.7065858
## 43 unclassified Otu00168  0.5823920
## 44 unclassified Otu00175  0.3689566
## 45         <NA> Otu00189  0.3992416
## 46 unclassified Otu00208  0.3326477
## 47         <NA> Otu00213  0.6243399
## 48 unclassified Otu00225  0.5072148
## 49 unclassified Otu00232 -0.5309842
## 50         <NA> Otu00245  0.6375114
## 51 unclassified Otu00290  0.5884746
## 52 unclassified Otu01489  0.8583584
# Calculate correlation between phycocyanin and nc-bacteria  
cyano_corrs <-  corr.test(
    x = data.frame(pcoa_df$Phycocyanin), 
    y = t(otu_table(nc_prune)),
    method = "spearman",
    use = "complete",
    adjust = "fdr"
)

which_sigs <- which(cyano_corrs$p < 0.05)  
sig_otus <- colnames(cyano_corrs$p)[which_sigs]


cyano_corrs_dat <- data.frame(tax_table(nc_prune)) %>% filter(Species %in% sig_otus)
cyano_corrs_dat$Rho <- cyano_corrs$r[which_sigs]

Figure 5: Taxonomic group OTU dynamics

nc_prune %>% 
  tax_glom(taxrank = "Family") %>% 
  psmelt() %>%
  group_by(Family) %>%
  summarise(mean = mean(Abundance)) %>%
  arrange(desc(mean))
## # A tibble: 44 x 2
##               Family      mean
##               <fctr>     <dbl>
## 1                acI 3390.6604
## 2               bacI 1063.3019
## 3               betI  686.2264
## 4  Planctomycetaceae  647.2075
## 5               bacV  467.3962
## 6              betIV  447.0189
## 7              betII  271.0377
## 8              acTH1  241.8302
## 9               acIV  235.3396
## 10            bacIII  229.1321
## # ... with 34 more rows

AcI is the most abundant clade

groups <- c("acI", "bacI", "betI", "Planctomycetaceae", "bacV")

nc_melt <- nc_prune %>%
  transform_sample_counts(function(x) {x/sum(x)} ) %>%
  psmelt() %>%
  order_dates

group_otu_plots <- lapply(groups, function(x) {
  group_df <- 
    nc_melt %>%
      filter(Family == x)
  
  group_otus <- unique(group_df$Species)
  
  group_plots <- lapply(group_otus, function(y) {
    df_otu <- filter(group_df, OTU == y)
    plot <- plot_otus(df = df_otu, otu = y, taxrank = "Genus")
  })
  return(group_plots)
})


do.call(grid.arrange, group_otu_plots[[1]])

do.call(grid.arrange, group_otu_plots[[2]])

do.call(grid.arrange, group_otu_plots[[3]])

do.call(grid.arrange, group_otu_plots[[4]])

do.call(grid.arrange, group_otu_plots[[5]])

# AcI plot
aci_plots <- group_otu_plots[[1]]

grid.arrange(
  aci_plots[[3]], aci_plots[[4]], aci_plots[[5]], aci_plots[[8]], 
  aci_plots[[1]], aci_plots[[2]], aci_plots[[6]], aci_plots[[7]], 
  station_legend,
  ncol = 4, nrow = 3
)